{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "\n",
    "import category_encoders as ce\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn import metrics\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split,cross_val_score,KFold\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "from sklearn.preprocessing import StandardScaler,MinMaxScaler,Normalizer\n",
    "\n",
    "%matplotlib inline\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "train_df=pd.read_csv('./titanic/train.csv')\n",
    "test_df=pd.read_csv('./titanic/test.csv')\n",
    "\n",
    "# Repeat the preprocessing from the previous section.\n",
    "# Missing values: assign the filled column back instead of calling fillna(inplace=True) on a\n",
    "# column selection, which is not guaranteed to modify the parent frame (pandas copy-on-write).\n",
    "# Every fill statistic is computed on the training set only and reused for the test set.\n",
    "train_df['Cabin']=train_df['Cabin'].fillna('missing')\n",
    "test_df['Cabin']=test_df['Cabin'].fillna('missing')\n",
    "train_df['Embarked']=train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])\n",
    "train_df['Age']=train_df['Age'].fillna(train_df['Age'].mean())\n",
    "test_df['Age']=test_df['Age'].fillna(train_df['Age'].mean())\n",
    "test_df['Fare']=test_df['Fare'].fillna(train_df['Fare'].mean())\n",
    "\n",
    "# Drop the columns that are not used as features\n",
    "train_df=train_df.drop(columns=['Name','Ticket','PassengerId'])\n",
    "test_df=test_df.drop(columns=['Name','Ticket','PassengerId'])\n",
    "\n",
    "# Separate the target column from the training features\n",
    "label=train_df.pop(\"Survived\")\n",
    "\n",
    "# Target-encode Embarked and Cabin (encoder fitted on the training features and labels only)\n",
    "target_encoder = ce.TargetEncoder(cols=['Embarked','Cabin']).fit(train_df,label)\n",
    "train_df=target_encoder.transform(train_df)\n",
    "test_df=target_encoder.transform(test_df)\n",
    "\n",
    "# One-hot encode Sex\n",
    "onehot_encoder = ce.OneHotEncoder(cols=['Sex']).fit(train_df)\n",
    "train_df=onehot_encoder.transform(train_df)\n",
    "test_df=onehot_encoder.transform(test_df)\n",
    "\n",
    "# z-score standardization as an example (scaler fitted on the training set only).\n",
    "# NOTE: new_train_df / new_test_df are not referenced again in this notebook; the polynomial\n",
    "# features below are built from the unscaled train_df.\n",
    "standard_scaler=StandardScaler()\n",
    "standard_scaler.fit(train_df)\n",
    "new_train_df=pd.DataFrame(standard_scaler.transform(train_df),columns=train_df.columns)\n",
    "new_test_df=pd.DataFrame(standard_scaler.transform(test_df),columns=train_df.columns)\n",
    "\n",
    "# Degree-2 polynomial features: original columns, squares and pairwise products, no bias column\n",
    "poly=PolynomialFeatures(degree=2,include_bias=False,interaction_only=False)\n",
    "poly_fea_np=poly.fit_transform(train_df)\n",
    "# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when it exists\n",
    "# and fall back to the old method on older releases.\n",
    "poly_feature_names=poly.get_feature_names_out() if hasattr(poly,'get_feature_names_out') else poly.get_feature_names()\n",
    "poly_fea_df=pd.DataFrame(poly_fea_np,columns=poly_feature_names)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "模型的优化，可以从两方面考虑：\n",
    "\n",
    "（1）单模型优化：超参搜索；\n",
    "\n",
    "（2）多模型集成：集成学习；"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 一.超参数搜索\n",
    "超参是指需要人为设定的参数，比如前面gbdt中的n_estimators,max_depth,learning_rate等；目前常见的超参搜索有网格搜索、随机搜索、贝叶斯优化搜索，还有基于强化学习的，比如google vizier...，其实比较好的方法是“人工智能”搜索（只需要一个excel表，并记录到相关操作对结果的改变就好了<坏结果也要保留>）..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 网格搜索  \n",
    "\n",
    "网格搜索有个缺点那就是很容易有“漏网之鱼”，网格搜索的参数都位于“交点”上，而最优解不一定落在这上面"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv='warn', error_score='raise-deprecating',\n",
       "             estimator=GradientBoostingClassifier(criterion='friedman_mse',\n",
       "                                                  init=None, learning_rate=0.1,\n",
       "                                                  loss='deviance', max_depth=3,\n",
       "                                                  max_features=None,\n",
       "                                                  max_leaf_nodes=None,\n",
       "                                                  min_impurity_decrease=0.0,\n",
       "                                                  min_impurity_split=None,\n",
       "                                                  min_samples_leaf=1,\n",
       "                                                  min_samples_split=2,\n",
       "                                                  min_weight_fraction_leaf=0.0,\n",
       "                                                  n_estimators=100,\n",
       "                                                  n_iter_no_change=None,\n",
       "                                                  presort='auto',\n",
       "                                                  random_state=None,\n",
       "                                                  subsample=1.0, tol=0.0001,\n",
       "                                                  validation_fraction=0.1,\n",
       "                                                  verbose=0, warm_start=False),\n",
       "             iid='warn', n_jobs=None,\n",
       "             param_grid={'learning_rate': [0.1, 0.15, 0.2],\n",
       "                         'max_depth': [3, 4, 5],\n",
       "                         'n_estimators': [50, 80, 100, 150]},\n",
       "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
       "             scoring='f1', verbose=0)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "# Search space: every combination of the listed values is tried (3 x 3 x 4 = 36 candidates)\n",
    "gdbt_parameters = {\n",
    "    'max_depth': [3, 4, 5],\n",
    "    'learning_rate': [0.1, 0.15, 0.2],\n",
    "    'n_estimators': [50, 80, 100, 150],\n",
    "}\n",
    "# Estimator whose hyper-parameters are being tuned\n",
    "gbdt = GradientBoostingClassifier()\n",
    "# Exhaustive search scored by F1; the number of CV folds is left at the library default\n",
    "grid = GridSearchCV(estimator=gbdt, param_grid=gdbt_parameters, scoring='f1')\n",
    "grid.fit(poly_fea_df, label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.7657182671569628,\n",
       " {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 150})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Best cross-validated F1 found by the grid search and the parameters that produced it\n",
    "(grid.best_score_, grid.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.7771387926391069, 0.052881600589403784)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Re-evaluate the grid-search winner with 5-fold cross-validation, scored by F1\n",
    "classifier = GradientBoostingClassifier(\n",
    "    learning_rate=0.2,\n",
    "    max_depth=3,\n",
    "    n_estimators=150,\n",
    ")\n",
    "scores = cross_val_score(classifier, poly_fea_df, label, cv=5, scoring='f1')\n",
    "# Mean and standard deviation of the per-fold F1 scores\n",
    "scores.mean(), scores.std()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 随机搜索\n",
    "随机搜索会在超参数空间内生成很多随机的点，然后利用这些点的超参进行模型训练\n",
    "\n",
    "更多：https://blog.csdn.net/qq_36810398/article/details/86699842"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv='warn', error_score='raise-deprecating',\n",
       "                   estimator=GradientBoostingClassifier(criterion='friedman_mse',\n",
       "                                                        init=None,\n",
       "                                                        learning_rate=0.1,\n",
       "                                                        loss='deviance',\n",
       "                                                        max_depth=3,\n",
       "                                                        max_features=None,\n",
       "                                                        max_leaf_nodes=None,\n",
       "                                                        min_impurity_decrease=0.0,\n",
       "                                                        min_impurity_split=None,\n",
       "                                                        min_samples_leaf=1,\n",
       "                                                        min_samples_split=2,\n",
       "                                                        min_weight_fraction_leaf=0.0,\n",
       "                                                        n_estimators=100,\n",
       "                                                        n_iter_no_change=None,\n",
       "                                                        presort='auto',\n",
       "                                                        random_state=None,\n",
       "                                                        subsample=1.0,\n",
       "                                                        tol=0.0001,\n",
       "                                                        validation_fraction=0.1,\n",
       "                                                        verbose=0,\n",
       "                                                        warm_start=False),\n",
       "                   iid='warn', n_iter=10, n_jobs=None,\n",
       "                   param_distributions={'learning_rate': [0.1, 0.15, 0.2],\n",
       "                                        'max_depth': [3, 4, 5],\n",
       "                                        'n_estimators': [50, 80, 100, 150]},\n",
       "                   pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
       "                   return_train_score=False, scoring='f1', verbose=0)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "\n",
    "# Search space: candidate values that the random search samples from\n",
    "gdbt_parameters = {\n",
    "    'max_depth': [3, 4, 5],\n",
    "    'learning_rate': [0.1, 0.15, 0.2],\n",
    "    'n_estimators': [50, 80, 100, 150],\n",
    "}\n",
    "# Estimator whose hyper-parameters are being tuned\n",
    "gbdt = GradientBoostingClassifier()\n",
    "# Randomized search scored by F1; the number of sampled settings and CV folds use library defaults\n",
    "random_search = RandomizedSearchCV(estimator=gbdt, param_distributions=gdbt_parameters, scoring='f1')\n",
    "random_search.fit(poly_fea_df, label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 80}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parameter setting selected by the random search (best cross-validated F1 among the sampled settings)\n",
    "random_search.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.7796924064819074, 0.037484958581777465)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Re-evaluate the configuration selected by the random search with 5-fold cross-validation (F1).\n",
    "# best_params_ is unpacked directly so the evaluated model always matches what the search\n",
    "# reported, including after a re-run that selects a different setting.\n",
    "classifier=GradientBoostingClassifier(**random_search.best_params_)\n",
    "scores = cross_val_score(classifier,poly_fea_df,label, scoring='f1', cv = 5)\n",
    "# Mean and standard deviation of the per-fold F1 scores\n",
    "np.mean(scores),np.std(scores)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 贝叶斯优化\n",
    "这里推荐使用Hyperopt工具  \n",
    "更多：https://www.jianshu.com/p/35eed1567463"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████| 300/300 [04:01<00:00,  1.53it/s, best loss: -0.797873471113155]\n",
      "best: {'learning_rate': 1, 'max_depth': 0, 'n_estimators': 1}\n"
     ]
    }
   ],
   "source": [
    "from hyperopt import fmin, tpe, hp, space_eval, STATUS_OK, Trials\n",
    "\n",
    "def hyperopt_train_test(params):\n",
    "    \"\"\"Return the mean 5-fold cross-validated F1 of a GBDT built from ``params``.\n",
    "\n",
    "    ``params`` is a dict of keyword arguments for GradientBoostingClassifier.\n",
    "    \"\"\"\n",
    "    clf = GradientBoostingClassifier(**params)\n",
    "    return cross_val_score(clf, poly_fea_df,label,cv=5,scoring='f1').mean()\n",
    "\n",
    "# Search space: hp.choice selects one of the listed options for each hyper-parameter\n",
    "space4gbdt = {\n",
    "    'max_depth': hp.choice('max_depth', [3,4,5]),\n",
    "    'n_estimators': hp.choice('n_estimators', [50,80,100,150]),\n",
    "    'learning_rate': hp.choice('learning_rate', [0.1,0.15,0.2])\n",
    "}\n",
    "\n",
    "def f(params):\n",
    "    \"\"\"Objective for fmin: hyperopt minimizes 'loss', so the F1 score is negated.\"\"\"\n",
    "    f1 = hyperopt_train_test(params)\n",
    "    return {'loss': -f1, 'status': STATUS_OK}\n",
    "\n",
    "# Search for the best parameters\n",
    "trials = Trials()\n",
    "best = fmin(f, space4gbdt, algo=tpe.suggest, max_evals=300, trials=trials)\n",
    "# For hp.choice, fmin returns the *index* of the selected option, not the option value;\n",
    "# space_eval maps those indices back to the actual hyper-parameter values.\n",
    "print('best:',best)\n",
    "print('best (decoded):',space_eval(space4gbdt,best))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.7776312721540197, 0.03726768752987142)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from hyperopt import space_eval\n",
    "\n",
    "# fmin reports hp.choice results as option indices; decode them into real hyper-parameter\n",
    "# values before building the model, then re-evaluate with 5-fold cross-validation (F1).\n",
    "classifier=GradientBoostingClassifier(**space_eval(space4gbdt,best))\n",
    "scores = cross_val_score(classifier,poly_fea_df,label, scoring='f1', cv = 5)\n",
    "# Mean and standard deviation of the per-fold F1 scores\n",
    "np.mean(scores),np.std(scores)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 二.集成学习\n",
    "最后我们还可以将多个模型的输出结果进行集成，常见的方法有bagging(代表是rf)和boosting(代表是gbdt)；另外gbdt有多种实现版本，大家可以在各种竞赛(特别是kaggle)中经常见到，比如xgboost、lightgbm、catboost等。这里我介绍另外一种比较暴力的集成学习方法：**stacking**，它将下层(基)模型的预测结果作为上一层(元)模型的特征输入，结构如图：  \n",
    "![avatar](./source/stacking.jpg)\n",
    "\n",
    "更多： https://github.com/zhulei227/Stacking_Ensembles  \n",
    "更多stacking集成工具：https://www.jianshu.com/p/59313f43916f"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): wildcard import from the project-local stacking_classifier module. The classes used\n",
    "# below that are not imported anywhere else in this notebook (StackingClassifier, AdaBoostClassifier,\n",
    "# BaggingClassifier, LightGBMClassifier, SVMClassifier, NaiveBayesClassifier, LogisticRegression)\n",
    "# presumably come from it. It may also rebind RandomForestClassifier / GradientBoostingClassifier,\n",
    "# which were imported from sklearn in the first cell - confirm against the module.\n",
    "from stacking_classifier import *\n",
    "# Define the model structure: a list of base classifiers plus one meta classifier\n",
    "classifier = StackingClassifier(\n",
    "    base_classifiers=[\n",
    "        RandomForestClassifier(),\n",
    "        AdaBoostClassifier(),\n",
    "        BaggingClassifier(),\n",
    "        GradientBoostingClassifier(),\n",
    "        LightGBMClassifier(),\n",
    "        SVMClassifier(),\n",
    "        NaiveBayesClassifier(),\n",
    "    ],\n",
    "    meta_classifier=LogisticRegression(),\n",
    "    subsample_features_rate=0.9,\n",
    "    n_jobs=-1\n",
    ")\n",
    "# build_model() is defined by the stacking_classifier module; presumably it instantiates the\n",
    "# underlying estimators before fit() is called - TODO confirm\n",
    "classifier.build_model()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8159999999999998"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Hold out 20% of the rows, fit the stacked model on the rest and score F1 on the held-out part.\n",
    "# This is a single split, so the number is not directly comparable to the 5-fold means reported earlier.\n",
    "X_train, X_test, y_train, y_test = train_test_split(poly_fea_df, label, test_size=0.2)\n",
    "classifier.fit(X_train, y_train)\n",
    "y_predict = classifier.predict(X_test)\n",
    "f1_score = metrics.f1_score(y_test, y_predict)\n",
    "f1_score"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
